import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
%matplotlib inline
pd.set_option('display.max_columns', 100)
import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly
import cv2
import os
from os import path
from PIL import Image
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import time
import warnings
warnings.filterwarnings('ignore')
# --- Load the prepared datasets --------------------------------------------
def _read_indexed(csv_path):
    """Read ``csv_path`` with pandas, using its first column as the index."""
    return pd.read_csv(csv_path, index_col=0)


data = _read_indexed("Flipkart/data_cleaned.csv")
df_tsne_vgg = _read_indexed("Flipkart/im_features_vgg.csv")
df_sift = _read_indexed("Flipkart/im_features_sift.csv")
df = _read_indexed("Flipkart/ari_im.csv")
df_use = _read_indexed("Flipkart/df_use.csv")

# Aliases used by the rest of the script: df_use supplies the text features,
# df_sift the image features.
# NOTE(review): presumably df_use holds sentence-encoder embeddings of the
# product descriptions -- confirm against the script that produced the CSV.
text_features_final = df_use
image_features_sift = df_sift

# Place text and image features side by side (column-wise concatenation,
# rows matched on the index).
all_features_sift = pd.concat(
    [text_features_final, image_features_sift],
    axis="columns",
)

print("text_features shape :", text_features_final.shape)
print("image_features shape :", image_features_sift.shape)
print("all_features shape :", all_features_sift.shape)
def pca(vector):
    """Fit a PCA on ``vector`` and return the projected data.

    The estimator is created with ``n_components=0.99``; scikit-learn
    interprets a float in (0, 1) as the fraction of variance the retained
    components must explain.

    Args:
        vector: Feature matrix accepted by ``PCA.fit_transform``.

    Returns:
        The output of ``PCA.fit_transform(vector)``.
    """
    reducer = PCA(n_components=0.99)
    return reducer.fit_transform(vector)
# Reduce the combined text + SIFT features with PCA before running t-SNE.
pca_all_sift = pca(all_features_sift)

# t-SNE configuration; the `tsne` estimator built here is fitted again later
# in the script on the VGG features.
tsne_settings = {
    "n_components": 2,
    "verbose": 1,
    "perplexity": 80,
    "n_iter": 5000,
    "learning_rate": 200,
    "random_state": 42,
}
tsne = TSNE(**tsne_settings)

# 2-D embedding of the PCA output, also wrapped in a DataFrame for plotting.
X_tsne_siftt = tsne.fit_transform(pca_all_sift)
df_tsne_sift = pd.DataFrame(data=X_tsne_siftt, columns=["tsne1", "tsne2"])
print(df_tsne_sift.shape)

# Receives one Adjusted Rand Index per call to plot_kmeans_tsne().
list_ari = []
def plot_kmeans_tsne(reduction, title, filename, colname):
    """Cluster a 2-D embedding with KMeans, score it, and plot the result.

    Side effects (all on module-level state or disk):
      * stores the cluster labels in ``data[f'cluster {colname}']``;
      * appends the Adjusted Rand Index (clusters vs.
        ``data['product_category_1']``) to ``list_ari`` and prints it;
      * writes ``plots/{filename}.html`` and ``plots/{filename}_cluster.html``
        (the ``plots`` directory is created when missing);
      * displays both figures.

    Args:
        reduction: DataFrame whose first two columns are the x / y
            coordinates to cluster and plot. It must have as many rows as
            ``data``.
        title: Text appended to both figure titles.
        filename: Base name (without extension) of the exported HTML files.
        colname: Suffix of the ``data`` column receiving the cluster labels.

    Returns:
        A 2-tuple holding the return values of ``fig.show()`` and
        ``fig1.show()``.
    """
    # The number of clusters is fixed at 7 here.
    kmeans_tsne = KMeans(n_clusters=7, n_init=50, max_iter=200,
                         init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_

    cluster_col = f'cluster {colname}'
    data[cluster_col] = labels_tsne
    categories_predict = data[cluster_col]
    categories_true = data['product_category_1']

    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari.append(adjusted_rand)
    print(f"\033[1mAdjusted Rand Index: {adjusted_rand:0.3f}\033[0m")

    # Take the coordinates straight from `reduction`. The previous version
    # first concatenated the labels onto it, which could add NaN rows when
    # the indexes did not match, and only the two coordinate columns were
    # ever used.
    x_coords = reduction.iloc[:, 0]
    y_coords = reduction.iloc[:, 1]

    fig = px.scatter(data, x=x_coords, y=y_coords, color=categories_true,
                     title=f"Représentation selon les vraies classes {title}")
    fig1 = px.scatter(data, x=x_coords, y=y_coords, color=categories_predict,
                      title=f"Représentation selon les clusters {title}")

    # Writing the HTML files fails if the target directory does not exist.
    os.makedirs('plots', exist_ok=True)
    plotly.offline.plot(fig, filename=f'plots/{filename}.html')
    plotly.offline.plot(fig1, filename=f'plots/{filename}_cluster.html')
    return fig.show(), fig1.show()
# Cluster, score and plot the combined text + SIFT embedding.
plot_kmeans_tsne(df_tsne_sift, "Clusters Sift lemmatize", "Sift_lemmatize", "sift_lemmatize")

# Scatter of the t-SNE embedding with each product image drawn at its point.
fig, ax = plt.subplots(figsize=(15, 15))
plt.title('Visualisation Prédiction avec entrainement Texte et Image', fontweight='bold')

# NOTE(review): data.index is used as positional row numbers into the t-SNE
# array; this assumes the index of `data` is 0..n-1 -- confirm.
x_coords = X_tsne_siftt[data.index, 0]
y_coords = X_tsne_siftt[data.index, 1]
ax.scatter(x_coords, y_coords)

# The loop variable is not named `path`, so the module-level
# `from os import path` import is left intact.
image_files = 'Flipkart/Images/' + data.image
for x0, y0, image_file in zip(x_coords, y_coords, image_files):
    ab = AnnotationBbox(OffsetImage(plt.imread(image_file), zoom=0.025), (x0, y0), frameon=False)
    ax.add_artist(ab)

plt.xlabel('TSNE 1')
plt.ylabel('TSNE 2')
plt.show()
# Breakdown of the true categories found inside each SIFT-based cluster.
cluster_col = 'cluster sift_lemmatize'

# value_counts() lists the cluster labels from most to least populated, so
# the position in this sequence is NOT the cluster label itself.
cluster_labels = data[cluster_col].value_counts().index
index_tot = [data[data[cluster_col] == label].index for label in cluster_labels]

# plt.subplot() requires integer grid dimensions. Ceiling division gives the
# smallest column count that fits every cluster on n_rows rows.
n_rows = 4
n_cols = max(1, -(-len(index_tot) // n_rows))

plt.figure(figsize=(20, 20))
for position, (label, rows) in enumerate(zip(cluster_labels, index_tot), start=1):
    categories = data.loc[rows, 'product_category_1']
    # Bars are ordered from the most to the least frequent category.
    order_hue = categories.value_counts().index
    plt.subplot(n_rows, n_cols, position)
    sns.countplot(y=categories,
                  order=order_hue,
                  palette='Blues_r')
    # Title with the real cluster label so it matches the values stored in
    # `data` (the loop position was shown before).
    plt.title(f"Cluster {label}", fontsize=20)
# Same steps as for the SIFT features, this time with the VGG image features:
# concatenate with the text features, PCA, t-SNE, then KMeans + plots.
image_features_VGG = df_tsne_vgg
all_features_VGG = pd.concat(
    [text_features_final, image_features_VGG],
    axis="columns",
)

print("text_features shape :", text_features_final.shape)
print("image_features shape :", image_features_VGG.shape)
print("all_features shape :", all_features_VGG.shape)

pca_all_VGG = pca(all_features_VGG)

# The `tsne` estimator configured earlier is fitted again on the new input.
X_tsne_VGG = tsne.fit_transform(pca_all_VGG)
df_tsne_VGG = pd.DataFrame(data=X_tsne_VGG, columns=['tsne1', 'tsne2'])
print(df_tsne_VGG.shape)

plot_kmeans_tsne(
    reduction=df_tsne_VGG,
    title="Clusters VGG lemmatize",
    filename="vgg_lemmatize",
    colname="vgg_lemmatize",
)
# Breakdown of the true categories found inside each VGG-based cluster.
cluster_col = 'cluster vgg_lemmatize'

# value_counts() lists the cluster labels from most to least populated, so
# the position in this sequence is NOT the cluster label itself.
cluster_labels = data[cluster_col].value_counts().index
index_tot = [data[data[cluster_col] == label].index for label in cluster_labels]

# plt.subplot() requires integer grid dimensions. Ceiling division gives the
# smallest column count that fits every cluster on n_rows rows.
n_rows = 4
n_cols = max(1, -(-len(index_tot) // n_rows))

plt.figure(figsize=(20, 20))
for position, (label, rows) in enumerate(zip(cluster_labels, index_tot), start=1):
    categories = data.loc[rows, 'product_category_1']
    # Bars are ordered from the most to the least frequent category.
    order_hue = categories.value_counts().index
    plt.subplot(n_rows, n_cols, position)
    sns.countplot(y=categories,
                  order=order_hue,
                  palette='Blues_r')
    # Title with the real cluster label so it matches the values stored in
    # `data` (the loop position was shown before).
    plt.title(f"Cluster {label}", fontsize=20)
# One-row table holding the ARI scores collected by plot_kmeans_tsne().
# NOTE(review): this needs list_ari to hold exactly two values (SIFT, then
# VGG); re-running the clustering calls in the same session would add more.
df_ari = pd.DataFrame(
    data=[list_ari],
    index=['ARI_SCORE'],
    columns=['sift_lem', 'vgg_lem'],
)

# Bar chart of the two scores computed in this script.
ari_axes = df_ari.transpose().round(2).plot(kind="bar", figsize=(10, 6))
ari_axes.set_xlabel("Model")
ari_axes.set_ylabel("ARI Score")

# Add the new scores as extra columns of the table loaded from ari_im.csv
# (inner join on the row index), then plot every model together.
df = df.join(df_ari, how="inner")
all_ari_axes = df.transpose().round(2).plot(kind="bar", figsize=(10, 6))
all_ari_axes.set_xlabel("Model")
all_ari_axes.set_ylabel("ARI Score")
# Contingency table: true categories (rows) vs. VGG-based clusters (columns).
ct = pd.crosstab(
    index=data['product_category_1'],
    columns=data['cluster vgg_lemmatize'],
)

# Annotated heatmap of the contingency table.
plt.figure(figsize=(10, 7))
ax = sns.heatmap(ct, annot=True, fmt='g')
ax.set_xlabel('Clusters')
ax.set_ylabel('Categories')

# Extend the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for heatmap rows being cut off in
# some Matplotlib releases -- confirm it is still needed.
y_start, y_end = ax.get_ylim()
ax.set_ylim(y_start + 0.5, y_end - 0.5);
Certaines catégories ont leurs produits correctement classés. D'autres catégories n'ont pas pu être associées clairement à un cluster, comme la catégorie "Home Furnishing" ou la catégorie "Kitchen & Dining".
Il est impossible d'associer avec certitude une catégorie à un numéro de cluster.
# Scatter of the VGG-based t-SNE embedding with each product image drawn at
# its point.
fig, ax = plt.subplots(figsize=(15, 15))
plt.title('Visualisation Prédiction avec entrainement Texte et Image', fontweight='bold')

# NOTE(review): data.index is used as positional row numbers into the t-SNE
# array; this assumes the index of `data` is 0..n-1 -- confirm.
x_coords = X_tsne_VGG[data.index, 0]
y_coords = X_tsne_VGG[data.index, 1]
ax.scatter(x_coords, y_coords)

# The loop variable is not named `path`, so the module-level
# `from os import path` import is left intact.
image_files = 'Flipkart/Images/' + data.image
for x0, y0, image_file in zip(x_coords, y_coords, image_files):
    ab = AnnotationBbox(OffsetImage(plt.imread(image_file), zoom=0.025), (x0, y0), frameon=False)
    ax.add_artist(ab)

plt.xlabel('TSNE 1')
plt.ylabel('TSNE 2')
plt.show()
La classification à partir des données textuelles (descriptions lemmatisées) et des images traitées avec VGG donne des résultats satisfaisants.